from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, year, avg, col

spark = SparkSession.builder.getOrCreate()

# Read the stock CSV (header row included, column types inferred)
df1 = spark.read.csv(r'C:\Users\Wang\Desktop\data\stock_small.csv', header=True, inferSchema=True)

# Keep only <stock symbol>, <date>, <adjusted close price>
df = df1.select('stock_symbol', 'date', 'stock_price_adj_close')

# Parse the date column and extract the year
df = df.withColumn('date', to_date(col('date'))) \
       .withColumn('year', year(col('date')))

# Restrict to AAPL, then compute the average adjusted close price per year
data = df.filter(col('stock_symbol') == 'AAPL')
df2 = data.groupBy('year') \
          .agg(avg('stock_price_adj_close').alias('avg_adj_close'))

# Show only the years whose average adjusted close is at least 50
df2.filter(col('avg_adj_close') >= 50).show()


